{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "This notebook finds control patents for a given cohort year.\n",
    "It is organized into 5 parts, one for each type of control\n",
    "(the first 4 are the controls used in the paper; the last is an extra, relaxed version of the Common control).\n",
    "\n",
    "*data required(import files)\n",
    "1. basic\n",
    "2. class\n",
    "3. citing_cited_cleaned\n",
    "4. citing_cited\n",
    "\n",
    "*result(export files)\n",
    "controls#_YYYY_ZZZZ.csv (# = 1~4; YYYY = cohort year, ZZZZ = cohort year + 9)\n",
    "e.g. controls1_1976_1985.csv\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Cohort year to process; edit this value to run the notebook for a different cohort.\n",
    "cohort = 1976"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import gc\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Integer date bounds derived from the cohort year (presumably yyyymmdd-style keys -- confirm against basic.csv).\n",
    "# The filters below compare against them with strict inequalities.\n",
    "cited_month_start = 10000 * cohort + 100          # lower bound, cohort year\n",
    "cited_month_finish = 10000 * cohort + 1300        # upper bound, cohort year\n",
    "citing_finish = 10000 * (cohort + 9) + 1300       # upper bound, nine years after the cohort year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the basic data files shared by all control types.\n",
    "POA = pd.read_csv('basic.csv', sep='*')\n",
    "PC = pd.read_csv('class.csv', sep='*')\n",
    "\n",
    "# Cited set: isd strictly inside the cohort-year bounds.\n",
    "POA_cited = POA.loc[POA['isd'].gt(cited_month_start) & POA['isd'].lt(cited_month_finish)]\n",
    "# Citing set: apd strictly between the cohort start and the citing finish bound.\n",
    "POA_citing = POA.loc[POA['apd'].gt(cited_month_start) & POA['apd'].lt(citing_finish)]\n",
    "# Same citing window widened by 10000 on each side; used for the exclusion check.\n",
    "POA_citing_check = POA.loc[POA['apd'].gt(cited_month_start - 10000) & POA['apd'].lt(citing_finish + 10000)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 1. 3-digit control: citingOCL-first-class-controlOCL-or-XCL-first-class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "130567"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import citation data.\n",
    "#   CCC: cleaned citation pairs whose cited patent is in POA_cited and whose citing patent is in POA_citing.\n",
    "#   CC : uncleaned citation pairs (wider POA_citing_check window); used only as a check list so that\n",
    "#        patents already citing the same cited patent (e.g. self-citations) are not picked as controls.\n",
    "ccc_chunks = []\n",
    "for lines in pd.read_csv('citing_cited_cleaned.csv', sep='*', chunksize=1000000):\n",
    "    lines = pd.merge(lines, POA_cited[['wku']], left_on='cited', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    lines = pd.merge(lines, POA_citing[['wku']], left_on='citing', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    ccc_chunks.append(lines)\n",
    "# Concatenate once instead of growing the frame chunk by chunk from an empty object-dtype frame.\n",
    "CCC = pd.concat(ccc_chunks, ignore_index=True) if ccc_chunks else pd.DataFrame(columns=['citing', 'cited'])\n",
    "\n",
    "cc_chunks = []\n",
    "for lines in pd.read_csv('citing_cited.csv', sep='*', chunksize=1000000):\n",
    "    lines = pd.merge(lines, POA_cited[['wku']], left_on='cited', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    lines = pd.merge(lines, POA_citing_check[['wku']], left_on='citing', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    cc_chunks.append(lines)\n",
    "CC = pd.concat(cc_chunks, ignore_index=True) if cc_chunks else pd.DataFrame(columns=['citing', 'cited'])\n",
    "del ccc_chunks, cc_chunks\n",
    "\n",
    "# Attach adate and first_class from the citing patent's OCL class row(s).\n",
    "# Columns are renamed by name: a positional `CCC.columns = [...]` silently swaps the\n",
    "# citing/cited labels whenever the concatenated frame is ordered ['citing', 'cited'].\n",
    "citing_info = PC.loc[PC.type == 'OCL', ['wku', 'adate', 'first_class']]\n",
    "CCC = pd.merge(CCC, citing_info, left_on='citing', right_on='wku', how='left').drop('wku', axis=1)\n",
    "CCC = CCC.rename(columns={'adate': 'citing_adate', 'first_class': 'citing_first_OCL'})\n",
    "CCC = CCC[['cited', 'citing', 'citing_adate', 'citing_first_OCL']]\n",
    "len(CCC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "404"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Group the citation pairs by the citing patent's first class; display the number of groups.\n",
    "CCC_group = CCC.groupby('citing_first_OCL')\n",
    "len(CCC_group)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "50\n",
      "100\n",
      "150\n",
      "200\n",
      "250\n",
      "300\n",
      "350\n",
      "400\n",
      "450\n",
      "500\n",
      "--- 567.1881346225739 min ---\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "\n",
    "# Windows on |candidate adate - citing adate|, tried from narrowest to widest:\n",
    "# (maximum gap, period label). 30 -> 1 month, 90 -> 3 months, 180 -> 6 months.\n",
    "# A wider window is only tried when every narrower one was empty, so `gap <= max_gap`\n",
    "# selects exactly the same candidates as the explicit lower/upper band would.\n",
    "WINDOWS = ((30, 1), (90, 3), (180, 6))\n",
    "\n",
    "# Rows are collected in a list and turned into a DataFrame once at the end;\n",
    "# appending with `www.loc[len(www)] = ...` gets very slow as the frame grows.\n",
    "records = []\n",
    "for index, OCL in enumerate(CCC_group.groups):\n",
    "    if index % 50 == 0: print(index)\n",
    "    CCC_OCL = CCC_group.get_group(OCL)\n",
    "    # Candidate pool: every class row (any type) whose first_class equals this group's class.\n",
    "    temp0 = PC[PC.first_class == OCL][['wku', 'adate']]\n",
    "\n",
    "    for i, row in CCC_OCL.iterrows():\n",
    "        citing_control = 0  # 0 = no control found\n",
    "        period = 0          # 0 = no window produced a candidate\n",
    "        # Patents that already cite the same cited patent cannot serve as controls.\n",
    "        already_citing = CC.loc[CC.cited == row.cited, 'citing']\n",
    "        eligible = temp0[~temp0.wku.isin(already_citing)]\n",
    "        gap = (eligible.adate - row.citing_adate).abs()\n",
    "\n",
    "        for max_gap, label in WINDOWS:\n",
    "            pool = eligible.loc[gap <= max_gap, 'wku'].drop_duplicates()\n",
    "            if not pool.empty:\n",
    "                # NOTE: the draw is unseeded, so reruns pick different controls.\n",
    "                # .iloc[0] replaces int(<one-element Series>), which newer pandas deprecates.\n",
    "                citing_control = int(pool.sample(n=1).iloc[0])\n",
    "                period = label\n",
    "                break\n",
    "\n",
    "        records.append([row.cited, row.citing, citing_control, period])\n",
    "\n",
    "    del temp0\n",
    "    gc.collect()\n",
    "\n",
    "www = pd.DataFrame(records, columns=['cited', 'citing', 'citing_control', 'period'])\n",
    "del records\n",
    "\n",
    "print(\"--- %s min ---\" % ((time.time() - start_time)/60))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Save results; the file name uses the first four digits of the start and finish bounds.\n",
    "year_from = str(cited_month_start)[:4]\n",
    "year_to = str(citing_finish)[:4]\n",
    "out_name = \"controls1_\" + year_from + \"_\" + year_to + \".csv\"\n",
    "www.to_csv(out_name, sep=',')\n",
    "\n",
    "# Release the large frames.\n",
    "del www, CCC, CC\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2. Any control: citingOCL-firstsub-class-controlOCL-or-XCL-firstsub-class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "871757"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import citation data.\n",
    "#   CCC: cleaned citation pairs whose cited patent is in POA_cited and whose citing patent is in POA_citing.\n",
    "#   CC : uncleaned citation pairs (wider POA_citing_check window); used only as a check list so that\n",
    "#        patents already citing the same cited patent are not picked as controls.\n",
    "ccc_chunks = []\n",
    "for lines in pd.read_csv('citing_cited_cleaned.csv', sep='*', chunksize=1000000):\n",
    "    lines = pd.merge(lines, POA_cited[['wku']], left_on='cited', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    lines = pd.merge(lines, POA_citing[['wku']], left_on='citing', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    ccc_chunks.append(lines)\n",
    "# Concatenate once instead of growing the frame chunk by chunk from an empty object-dtype frame.\n",
    "CCC = pd.concat(ccc_chunks, ignore_index=True) if ccc_chunks else pd.DataFrame(columns=['citing', 'cited'])\n",
    "\n",
    "cc_chunks = []\n",
    "for lines in pd.read_csv('citing_cited.csv', sep='*', chunksize=1000000):\n",
    "    lines = pd.merge(lines, POA_cited[['wku']], left_on='cited', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    lines = pd.merge(lines, POA_citing_check[['wku']], left_on='citing', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    cc_chunks.append(lines)\n",
    "CC = pd.concat(cc_chunks, ignore_index=True) if cc_chunks else pd.DataFrame(columns=['citing', 'cited'])\n",
    "del ccc_chunks, cc_chunks\n",
    "\n",
    "# Attach adate, first_class and fullclass from the citing patent's OCL class row(s).\n",
    "# Columns are renamed by name: a positional `CCC.columns = [...]` silently swaps the\n",
    "# citing/cited labels whenever the concatenated frame is ordered ['citing', 'cited'].\n",
    "citing_info = PC.loc[PC.type == 'OCL', ['wku', 'adate', 'first_class', 'fullclass']]\n",
    "CCC = pd.merge(CCC, citing_info, left_on='citing', right_on='wku', how='left').drop('wku', axis=1)\n",
    "CCC = CCC.rename(columns={'adate': 'citing_adate',\n",
    "                          'first_class': 'citing_first_OCL',\n",
    "                          'fullclass': 'citing_fullclass_OCL'})\n",
    "CCC = CCC[['cited', 'citing', 'citing_adate', 'citing_first_OCL', 'citing_fullclass_OCL']]\n",
    "len(CCC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "507"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Group the citation pairs by the citing patent's first class; display the number of groups.\n",
    "CCC_group = CCC.groupby('citing_first_OCL')\n",
    "len(CCC_group)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "50\n",
      "100\n",
      "150\n",
      "200\n",
      "250\n",
      "300\n",
      "350\n",
      "400\n",
      "450\n",
      "500\n",
      "--- 603.9427474379539 min ---\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "\n",
    "# Windows on |candidate adate - citing adate|, tried from narrowest to widest:\n",
    "# (maximum gap, period label). Labels are the window length in months (1/3/6), matching the\n",
    "# other control types; this cell used to write 2 and 3 for the 3- and 6-month windows.\n",
    "# A wider window is only tried when every narrower one was empty, so `gap <= max_gap`\n",
    "# selects exactly the same candidates as the explicit lower/upper band would.\n",
    "WINDOWS = ((30, 1), (90, 3), (180, 6))\n",
    "\n",
    "# Rows are collected in a list and turned into a DataFrame once at the end;\n",
    "# appending with `www.loc[len(www)] = ...` gets very slow as the frame grows.\n",
    "records = []\n",
    "for index, OCL in enumerate(CCC_group.groups):\n",
    "    if index % 50 == 0: print(index)\n",
    "    CCC_OCL = CCC_group.get_group(OCL)\n",
    "    # Class rows of any type (OCL or XCL) whose first_class equals this group's class.\n",
    "    temp = PC[PC.first_class == OCL][['wku', 'adate', 'fullclass']]\n",
    "\n",
    "    for i, row in CCC_OCL.iterrows():\n",
    "        citing_control = 0  # 0 = no control found\n",
    "        period = 0          # 0 = no window produced a candidate\n",
    "        # Patents that already cite the same cited patent cannot serve as controls.\n",
    "        already_citing = CC.loc[CC.cited == row.cited, 'citing']\n",
    "        # Keep candidates whose OCL or XCL full class equals the citing patent's OCL full class.\n",
    "        temp0 = temp[temp.fullclass == row.citing_fullclass_OCL]\n",
    "        eligible = temp0[~temp0.wku.isin(already_citing)]\n",
    "        gap = (eligible.adate - row.citing_adate).abs()\n",
    "\n",
    "        for max_gap, label in WINDOWS:\n",
    "            pool = eligible.loc[gap <= max_gap, 'wku'].drop_duplicates()\n",
    "            if not pool.empty:\n",
    "                # NOTE: the draw is unseeded, so reruns pick different controls.\n",
    "                # .iloc[0] replaces int(<one-element Series>), which newer pandas deprecates.\n",
    "                citing_control = int(pool.sample(n=1).iloc[0])\n",
    "                period = label\n",
    "                break\n",
    "\n",
    "        records.append([row.cited, row.citing, citing_control, period])\n",
    "\n",
    "    del temp\n",
    "    gc.collect()\n",
    "\n",
    "www = pd.DataFrame(records, columns=['cited', 'citing', 'citing_control', 'period'])\n",
    "del records\n",
    "\n",
    "print(\"--- %s min ---\" % ((time.time() - start_time)/60))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Save results; the file name uses the first four digits of the start and finish bounds.\n",
    "year_from = str(cited_month_start)[:4]\n",
    "year_to = str(citing_finish)[:4]\n",
    "out_name = \"controls2_\" + year_from + \"_\" + year_to + \".csv\"\n",
    "www.to_csv(out_name, sep=',')\n",
    "\n",
    "# Release the large frames.\n",
    "del www, CCC, CC\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#3. Primary control:  citingOCL-firstsub-class-controlOCL-firstsub-class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "871757"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import citation data.\n",
    "#   CCC: cleaned citation pairs whose cited patent is in POA_cited and whose citing patent is in POA_citing.\n",
    "#   CC : uncleaned citation pairs (wider POA_citing_check window); used only as a check list so that\n",
    "#        patents already citing the same cited patent are not picked as controls.\n",
    "ccc_chunks = []\n",
    "for lines in pd.read_csv('citing_cited_cleaned.csv', sep='*', chunksize=1000000):\n",
    "    lines = pd.merge(lines, POA_cited[['wku']], left_on='cited', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    lines = pd.merge(lines, POA_citing[['wku']], left_on='citing', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    ccc_chunks.append(lines)\n",
    "# Concatenate once instead of growing the frame chunk by chunk from an empty object-dtype frame.\n",
    "CCC = pd.concat(ccc_chunks, ignore_index=True) if ccc_chunks else pd.DataFrame(columns=['citing', 'cited'])\n",
    "\n",
    "cc_chunks = []\n",
    "for lines in pd.read_csv('citing_cited.csv', sep='*', chunksize=1000000):\n",
    "    lines = pd.merge(lines, POA_cited[['wku']], left_on='cited', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    lines = pd.merge(lines, POA_citing_check[['wku']], left_on='citing', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    cc_chunks.append(lines)\n",
    "CC = pd.concat(cc_chunks, ignore_index=True) if cc_chunks else pd.DataFrame(columns=['citing', 'cited'])\n",
    "del ccc_chunks, cc_chunks\n",
    "\n",
    "# Attach adate, first_class and fullclass from the citing patent's OCL class row(s).\n",
    "# Columns are renamed by name: a positional `CCC.columns = [...]` silently swaps the\n",
    "# citing/cited labels whenever the concatenated frame is ordered ['citing', 'cited'].\n",
    "citing_info = PC.loc[PC.type == 'OCL', ['wku', 'adate', 'first_class', 'fullclass']]\n",
    "CCC = pd.merge(CCC, citing_info, left_on='citing', right_on='wku', how='left').drop('wku', axis=1)\n",
    "CCC = CCC.rename(columns={'adate': 'citing_adate',\n",
    "                          'first_class': 'citing_first_OCL',\n",
    "                          'fullclass': 'citing_fullclass_OCL'})\n",
    "CCC = CCC[['cited', 'citing', 'citing_adate', 'citing_first_OCL', 'citing_fullclass_OCL']]\n",
    "len(CCC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "507"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Group the citation pairs by the citing patent's first class; display the number of groups.\n",
    "CCC_group = CCC.groupby('citing_first_OCL')\n",
    "len(CCC_group)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "50\n",
      "100\n",
      "150\n",
      "200\n",
      "250\n",
      "300\n",
      "350\n",
      "400\n",
      "450\n",
      "500\n",
      "--- 556.1078217506408 min ---\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "\n",
    "# Windows on |candidate adate - citing adate|, tried from narrowest to widest:\n",
    "# (maximum gap, period label). 30 -> 1 month, 90 -> 3 months, 180 -> 6 months.\n",
    "# A wider window is only tried when every narrower one was empty, so `gap <= max_gap`\n",
    "# selects exactly the same candidates as the explicit lower/upper band would.\n",
    "WINDOWS = ((30, 1), (90, 3), (180, 6))\n",
    "\n",
    "# Rows are collected in a list and turned into a DataFrame once at the end;\n",
    "# appending with `www.loc[len(www)] = ...` gets very slow as the frame grows.\n",
    "records = []\n",
    "for index, OCL in enumerate(CCC_group.groups):\n",
    "    if index % 50 == 0: print(index)\n",
    "    CCC_OCL = CCC_group.get_group(OCL)\n",
    "    # Restrict class type to OCL (the control must match the citing patent in its primary 9-digit class).\n",
    "    temp = PC[(PC.type == 'OCL') & (PC.first_class == OCL)][['wku', 'adate', 'fullclass']]\n",
    "\n",
    "    for i, row in CCC_OCL.iterrows():\n",
    "        citing_control = 0  # 0 = no control found\n",
    "        period = 0          # 0 = no window produced a candidate\n",
    "        # Patents that already cite the same cited patent cannot serve as controls.\n",
    "        already_citing = CC.loc[CC.cited == row.cited, 'citing']\n",
    "        # Keep candidates that share the 9-digit OCL with the citing patent.\n",
    "        temp0 = temp[temp.fullclass == row.citing_fullclass_OCL]\n",
    "        eligible = temp0[~temp0.wku.isin(already_citing)]\n",
    "        gap = (eligible.adate - row.citing_adate).abs()\n",
    "\n",
    "        for max_gap, label in WINDOWS:\n",
    "            pool = eligible.loc[gap <= max_gap, 'wku'].drop_duplicates()\n",
    "            if not pool.empty:\n",
    "                # NOTE: the draw is unseeded, so reruns pick different controls.\n",
    "                # .iloc[0] replaces int(<one-element Series>), which newer pandas deprecates.\n",
    "                citing_control = int(pool.sample(n=1).iloc[0])\n",
    "                period = label\n",
    "                break\n",
    "\n",
    "        records.append([row.cited, row.citing, citing_control, period])\n",
    "\n",
    "    del temp\n",
    "    gc.collect()\n",
    "\n",
    "www = pd.DataFrame(records, columns=['cited', 'citing', 'citing_control', 'period'])\n",
    "del records\n",
    "\n",
    "print(\"--- %s min ---\" % ((time.time() - start_time)/60))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Save results; the file name uses the first four digits of the start and finish bounds.\n",
    "year_from = str(cited_month_start)[:4]\n",
    "year_to = str(citing_finish)[:4]\n",
    "out_name = \"controls3_\" + year_from + \"_\" + year_to + \".csv\"\n",
    "www.to_csv(out_name, sep=',')\n",
    "\n",
    "# Release the large frames.\n",
    "del www, CCC, CC\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#4. Common control: citingOCL-firstsub-class-controlOCL-firstsub-classSharing_constraint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Import citation data.\n",
    "#   CCC: cleaned citation pairs whose cited patent is in POA_cited and whose citing patent is in POA_citing.\n",
    "#   CC : uncleaned citation pairs (wider POA_citing_check window); used only as a check list so that\n",
    "#        patents already citing the same cited patent are not picked as controls.\n",
    "ccc_chunks = []\n",
    "for lines in pd.read_csv('citing_cited_cleaned.csv', sep='*', chunksize=1000000):\n",
    "    lines = pd.merge(lines, POA_cited[['wku']], left_on='cited', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    lines = pd.merge(lines, POA_citing[['wku']], left_on='citing', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    ccc_chunks.append(lines)\n",
    "# Concatenate once instead of growing the frame chunk by chunk from an empty object-dtype frame.\n",
    "CCC = pd.concat(ccc_chunks, ignore_index=True) if ccc_chunks else pd.DataFrame(columns=['citing', 'cited'])\n",
    "\n",
    "cc_chunks = []\n",
    "for lines in pd.read_csv('citing_cited.csv', sep='*', chunksize=1000000):\n",
    "    lines = pd.merge(lines, POA_cited[['wku']], left_on='cited', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    lines = pd.merge(lines, POA_citing_check[['wku']], left_on='citing', right_on='wku', how='inner').drop('wku', axis=1)\n",
    "    cc_chunks.append(lines)\n",
    "CC = pd.concat(cc_chunks, ignore_index=True) if cc_chunks else pd.DataFrame(columns=['citing', 'cited'])\n",
    "del ccc_chunks, cc_chunks\n",
    "\n",
    "# Attach adate, first_class and fullclass from the citing patent's OCL class row(s).\n",
    "# Columns are renamed by name: a positional `CCC.columns = [...]` silently swaps the\n",
    "# citing/cited labels whenever the concatenated frame is ordered ['citing', 'cited'].\n",
    "citing_info = PC.loc[PC.type == 'OCL', ['wku', 'adate', 'first_class', 'fullclass']]\n",
    "CCC = pd.merge(CCC, citing_info, left_on='citing', right_on='wku', how='left').drop('wku', axis=1)\n",
    "CCC = CCC.rename(columns={'adate': 'citing_adate',\n",
    "                          'first_class': 'citing_first_OCL',\n",
    "                          'fullclass': 'citing_fullclass_OCL'})\n",
    "CCC = CCC[['cited', 'citing', 'citing_adate', 'citing_first_OCL', 'citing_fullclass_OCL']]\n",
    "len(CCC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Group the citation pairs by the citing patent's first class; display the number of groups.\n",
    "CCC_group = CCC.groupby('citing_first_OCL')\n",
    "len(CCC_group)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "start_time = time.time()\n",
    "\n",
    "# Windows on |candidate adate - citing adate|, tried from narrowest to widest:\n",
    "# (maximum gap, period label). 30 -> 1 month, 90 -> 3 months, 180 -> 6 months.\n",
    "# A wider window is only tried when every narrower one was empty, so `gap <= max_gap`\n",
    "# selects exactly the same candidates as the explicit lower/upper band would.\n",
    "WINDOWS = ((30, 1), (90, 3), (180, 6))\n",
    "\n",
    "# Rows are collected in a list and turned into a DataFrame once at the end;\n",
    "# appending with `www.loc[len(www)] = ...` gets very slow as the frame grows.\n",
    "records = []\n",
    "for index, OCL in enumerate(CCC_group.groups):\n",
    "    if index % 20 == 0: print(index, ': ', int((time.time() - start_time)/60), ' min')\n",
    "    CCC_OCL = CCC_group.get_group(OCL)\n",
    "    # Patents whose OCL row has this first_class, then every class row (any type) of those patents.\n",
    "    temp0 = PC[(PC.type == 'OCL') & (PC.first_class == OCL)]['wku']\n",
    "    temp1 = PC[PC.wku.isin(temp0)][['wku', 'adate', 'fullclass', 'type']]\n",
    "\n",
    "    for i, row in CCC_OCL.iterrows():\n",
    "        citing_control = 0  # 0 = no control found\n",
    "        period = 0          # 0 = no window produced a candidate\n",
    "        # Patents that already cite the same cited patent cannot serve as controls.\n",
    "        already_citing = CC.loc[CC.cited == row.cited, 'citing']\n",
    "        # Control's 9-digit OCL must equal the citing patent's 9-digit OCL.\n",
    "        same_ocl = temp1.loc[(temp1.type == 'OCL') & (temp1.fullclass == row.citing_fullclass_OCL), 'wku']\n",
    "        # Full classes (OCL or XCL) of the originating (cited) patent; looked up once per row.\n",
    "        cited_classes = PC.loc[PC.wku == row.cited, 'fullclass']\n",
    "        # The conditions are wrapped in brackets instead of using backslash continuations:\n",
    "        # a backslash followed by a space or a comment is a SyntaxError.\n",
    "        eligible = temp1[temp1.wku.isin(same_ocl)\n",
    "                         & temp1.fullclass.isin(cited_classes)  # shares a full class with the cited patent\n",
    "                         & ~temp1.wku.isin(already_citing)]\n",
    "        gap = (eligible.adate - row.citing_adate).abs()\n",
    "\n",
    "        for max_gap, label in WINDOWS:\n",
    "            pool = eligible.loc[gap <= max_gap, 'wku'].drop_duplicates()\n",
    "            if not pool.empty:\n",
    "                # NOTE: the draw is unseeded, so reruns pick different controls.\n",
    "                # .iloc[0] replaces int(<one-element Series>), which newer pandas deprecates.\n",
    "                citing_control = int(pool.sample(n=1).iloc[0])\n",
    "                period = label\n",
    "                break\n",
    "\n",
    "        records.append([row.cited, row.citing, citing_control, period])\n",
    "\n",
    "    del temp0, temp1\n",
    "    gc.collect()\n",
    "\n",
    "www = pd.DataFrame(records, columns=['cited', 'citing', 'citing_control', 'period'])\n",
    "del records\n",
    "\n",
    "print(\"--- %s hours ---\" % ((time.time() - start_time)/3600))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Save results; the file name uses the first four digits of the start and finish bounds.\n",
    "year_from = str(cited_month_start)[:4]\n",
    "year_to = str(citing_finish)[:4]\n",
    "out_name = \"controls4_\" + year_from + \"_\" + year_to + \".csv\"\n",
    "www.to_csv(out_name, sep=',')\n",
    "\n",
    "# Release the large frames.\n",
    "del www, CCC, CC\n",
    "gc.collect()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
